library(data.table)
library(rmarkdown)
library(AnnotationHub)
library(tidyverse)
library(tximport)
library(ggplot2)
library(DESeq2)
library(pheatmap)
library(gridExtra)
library(UpSetR)
library(apeglm)
library(ashr)
AnnotationSpecies <- "mus musculus" # Assign your species
ah <- AnnotationHub(hub=getAnnotationHubOption("URL")) # Bring annotation DB
# Filter annotation of interest
ahQuery <- query(ah, c("OrgDb", AnnotationSpecies))
if (length(ahQuery) == 1) {
DBName <- names(ahQuery)
} else if (length(ahQuery) > 1) {
DBName <- names(ahQuery)[1]
} else {
print("You don't have a valid DB")
rmarkdown::render()
}
AnnoDb <- ah[[DBName]] # Store into an OrgDb object
# Explore your OrgDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(AnnoDb, keytype="ENSEMBLTRANS")
# Note: Annotation has to be done with not genome but transcripts
AnnoDb <- select(AnnoDb,
AnnoKey,
keytype="ENSEMBLTRANS",
columns="ENSEMBL")
# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb)
## ENSEMBLTRANS ENSEMBL
## 1 ENSMUST00000030010 ENSMUSG00000015243
## 2 ENSMUST00000149127 ENSMUSG00000015243
## 3 ENSMUST00000033695 ENSMUSG00000031333
## 4 ENSMUST00000140333 ENSMUSG00000031333
## 5 ENSMUST00000236391 ENSMUSG00000024030
## 6 ENSMUST00000024829 ENSMUSG00000024030
.sf files have been created from fastq data by salmon
# This code chunk needs to be written by yourself
#
# Define sample names
SampleNames <- c(paste0("WT-rep", 1:3), paste0("cKO-rep", 1:3))
# Define group level
GroupLevel <- c("WT", "cKO")
# Define contrast for DE analysis
Contrast <- c("Group", "cKO", "WT")
# Define a vector for comparing TPM vs Counts effect
TvC <- c("TPM", "Counts")
levels(TvC) <- TvC
# Define .sf file path
sf <- c(paste0("salmon_output/",
SampleNames,
".salmon_quant/quant.sf"))
# Define sample groups
group <- c(rep(GroupLevel[1], 3), rep(GroupLevel[2], 3))
# Create metadata
metadata <- data.frame(Sample=factor(SampleNames, levels=SampleNames),
Group=factor(group, levels=GroupLevel),
Path=sf)
rownames(metadata) <- SampleNames
# Explore the metadata
print(metadata)
## Sample Group Path
## WT-rep1 WT-rep1 WT salmon_output/WT-rep1.salmon_quant/quant.sf
## WT-rep2 WT-rep2 WT salmon_output/WT-rep2.salmon_quant/quant.sf
## WT-rep3 WT-rep3 WT salmon_output/WT-rep3.salmon_quant/quant.sf
## cKO-rep1 cKO-rep1 cKO salmon_output/cKO-rep1.salmon_quant/quant.sf
## cKO-rep2 cKO-rep2 cKO salmon_output/cKO-rep2.salmon_quant/quant.sf
## cKO-rep3 cKO-rep3 cKO salmon_output/cKO-rep3.salmon_quant/quant.sf
# Assign sample names to the input (.sf) file path
names(sf) <- SampleNames
# Run tximport
# tpm vs original counts
# input sf: a factor of all .sf files' path
txi_tpm <- tximport(sf,
type="salmon",
tx2gene=AnnoDb,
countsFromAbundance="lengthScaledTPM", # Extracts TPM
ignoreTxVersion=T)
txi_counts <- tximport(sf,
type="salmon",
tx2gene=AnnoDb,
ignoreTxVersion=T)
# tpm
head(txi_tpm$counts)
## WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2
## ENSMUSG00000000001 6564.991768 5802.833633 5645.878976 5997.157707 6111.402833
## ENSMUSG00000000056 1575.371304 1432.872026 1309.555830 1526.721207 1658.347561
## ENSMUSG00000000058 4.982011 5.051528 4.115651 2.925755 1.002572
## ENSMUSG00000000078 3333.818365 2984.544915 2808.165368 3516.329828 3680.185701
## ENSMUSG00000000088 4490.283446 4141.351728 3882.312160 4167.144948 4205.359513
## ENSMUSG00000000093 9.362193 8.033400 7.795037 7.122818 4.059814
## cKO-rep3
## ENSMUSG00000000001 5361.224091
## ENSMUSG00000000056 1574.527535
## ENSMUSG00000000058 2.928238
## ENSMUSG00000000078 2833.866380
## ENSMUSG00000000088 3682.294112
## ENSMUSG00000000093 4.185473
dim(txi_tpm$counts)
## [1] 15022 6
# counts
head(txi_counts$counts)
## WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000001 6492 5822.000 5614 6040.000 6142.000 5437.000
## ENSMUSG00000000056 1637 1458.000 1395 1579.000 1608.999 1398.000
## ENSMUSG00000000058 5 5.000 4 3.000 1.000 3.000
## ENSMUSG00000000078 3344 2807.000 2828 3589.000 3731.000 2912.000
## ENSMUSG00000000088 4558 4149.929 3958 4189.921 4151.000 3623.928
## ENSMUSG00000000093 10 8.000 8 7.000 4.000 4.000
dim(txi_counts$counts)
## [1] 15022 6
# Set a function creating dds and vsd
dds_vsd_fn <- function(txi) {
# Create a DESeq object (so-calledd dds)
des <- DESeqDataSetFromTximport(txi,
colData=metadata,
design=~Group)
# Create a vsd object (so-called vsd)
ves <- vst(des, blind=T)
# Output them as a list
return(list(dds=des, vsd=ves))
}
TPM <- dds_vsd_fn(txi_tpm)
Counts <- dds_vsd_fn(txi_counts)
# Outputs
# dds: TPM/Counts[[1]] or TPM/Counts[['dds']]
# vsd: TPM/Counts[[2]] or TPM/Counts[['vsd']]
# TPM
TPM[[1]]
## class: DESeqDataSet
## dim: 15022 6
## metadata(1): version
## assays(1): counts
## rownames(15022): ENSMUSG00000000001 ENSMUSG00000000056 ...
## ENSMUSG00000118638 ENSMUSG00000118653
## rowData names(0):
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
head(counts(TPM[[1]]))
## WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000001 6565 5803 5646 5997 6111 5361
## ENSMUSG00000000056 1575 1433 1310 1527 1658 1575
## ENSMUSG00000000058 5 5 4 3 1 3
## ENSMUSG00000000078 3334 2985 2808 3516 3680 2834
## ENSMUSG00000000088 4490 4141 3882 4167 4205 3682
## ENSMUSG00000000093 9 8 8 7 4 4
# Counts
Counts[[1]]
## class: DESeqDataSet
## dim: 15022 6
## metadata(1): version
## assays(2): counts avgTxLength
## rownames(15022): ENSMUSG00000000001 ENSMUSG00000000056 ...
## ENSMUSG00000118638 ENSMUSG00000118653
## rowData names(0):
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
head(counts(Counts[[1]]))
## WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## ENSMUSG00000000001 6492 5822 5614 6040 6142 5437
## ENSMUSG00000000056 1637 1458 1395 1579 1609 1398
## ENSMUSG00000000058 5 5 4 3 1 3
## ENSMUSG00000000078 3344 2807 2828 3589 3731 2912
## ENSMUSG00000000088 4558 4150 3958 4190 4151 3624
## ENSMUSG00000000093 10 8 8 7 4 4
# TPM
TPM[[2]]
## class: DESeqTransform
## dim: 15022 6
## metadata(1): version
## assays(1): ''
## rownames(15022): ENSMUSG00000000001 ENSMUSG00000000056 ...
## ENSMUSG00000118638 ENSMUSG00000118653
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(4): Sample Group Path sizeFactor
# Counts
Counts[[2]]
## class: DESeqTransform
## dim: 15022 6
## metadata(1): version
## assays(1): ''
## rownames(15022): ENSMUSG00000000001 ENSMUSG00000000056 ...
## ENSMUSG00000118638 ENSMUSG00000118653
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
# Set a function estimating size factors, dispersions, and perform wald test
DESeqPrep_fn <- function(List) {
List[[1]] <- estimateSizeFactors(List[[1]])
List[[1]] <- estimateDispersions(List[[1]])
List[[1]] <- nbinomWaldTest(List[[1]])
return(List)
}
# Update dds with the function
Counts <- DESeqPrep_fn(Counts)
TPM <- DESeqPrep_fn(TPM)
sizeFactors(Counts[[1]])
## NULL
sizeFactors(TPM[[1]])
## WT-rep1 WT-rep2 WT-rep3 cKO-rep1 cKO-rep2 cKO-rep3
## 1.1096017 1.0008049 0.9706968 1.0240773 1.0397806 0.8919291
# Size factors don't exist in the Counts dds!
# Normalization factors are calculated in the Counts dds instead!
assays(Counts[[1]])
## List of length 6
## names(6): counts avgTxLength normalizationFactors mu H cooks
assays(TPM[[1]])
## List of length 4
## names(4): counts mu H cooks
colData(Counts[[1]])
## DataFrame with 6 rows and 3 columns
## Sample Group Path
## <factor> <factor> <character>
## WT-rep1 WT-rep1 WT salmon_output/WT-rep..
## WT-rep2 WT-rep2 WT salmon_output/WT-rep..
## WT-rep3 WT-rep3 WT salmon_output/WT-rep..
## cKO-rep1 cKO-rep1 cKO salmon_output/cKO-re..
## cKO-rep2 cKO-rep2 cKO salmon_output/cKO-re..
## cKO-rep3 cKO-rep3 cKO salmon_output/cKO-re..
colData(TPM[[1]])
## DataFrame with 6 rows and 4 columns
## Sample Group Path sizeFactor
## <factor> <factor> <character> <numeric>
## WT-rep1 WT-rep1 WT salmon_output/WT-rep.. 1.109602
## WT-rep2 WT-rep2 WT salmon_output/WT-rep.. 1.000805
## WT-rep3 WT-rep3 WT salmon_output/WT-rep.. 0.970697
## cKO-rep1 cKO-rep1 cKO salmon_output/cKO-re.. 1.024077
## cKO-rep2 cKO-rep2 cKO salmon_output/cKO-re.. 1.039781
## cKO-rep3 cKO-rep3 cKO salmon_output/cKO-re.. 0.891929
# Extract and save the size factors in a data frame
sizeFactor <- as.data.frame(round(sizeFactors(TPM[[1]]), 3))
colnames(sizeFactor) <- 'Size_Factor'
sizeFactor <- sizeFactor %>%
rownames_to_column(var="Sample") %>%
inner_join(metadata[, 1:ncol(metadata)-1], by="Sample")
sizeFactor$Sample <- factor(sizeFactor$Sample, levels=SampleNames)
# Create a plot comparing the size factors by sample
ggplot(sizeFactor, aes(x=Sample,
y=Size_Factor,
fill=Group,
label=Size_Factor)) +
geom_bar(stat="identity", width=0.8) +
theme_bw() +
ggtitle("Size Factors in TPM-DESeq") +
geom_text(vjust=1.5) +
theme(axis.text.x=element_text(angle=45,
vjust=0.5)) + ylab("Size Factor") + geom_hline(yintercept=1, color="blue", linetype="dashed")
# Extract and normalization factors in a data frame
normf <- as.data.frame(normalizationFactors(Counts[[1]])) %>%
gather(Sample, Normalization_Factor) %>%
inner_join(metadata[, 1:2], by="Sample")
normf$Sample <- factor(normf$Sample, levels=SampleNames)
normf$Group <- factor(normf$Group, levels=GroupLevel)
# Create a scatter plot showing distribution of normalization factors
normFactors_plot <- ggplot(normf,
aes(x=Sample, y=Normalization_Factor)) +
geom_jitter(alpha=0.5, aes(color=Group)) +
# Add a boxplot to provide statistics in each sample
geom_boxplot(aes(x=Sample, y=Normalization_Factor),
outlier.shape=NA, alpha=0.5) +
theme_bw() +
ggtitle("Normalization Factors in Counts-DESeq") +
theme(axis.text.x=element_text(angle=45,
vjust=0.5)) +
ylab("Normalization Factor / Gene") +
# Add a dashed horizontal line to indicate where normalization factor=1
geom_hline(yintercept=1,
color="blue",
linetype="dashed")
# Print the normalization factor scatter plot
print(normFactors_plot)
# Print the same plot with larger y-magnification in order to observe the box plot
normFactors_plot +
ylim(0.8, 1.2)
# Assigne what to compare
GroupOfInterest <- Contrast[1]
# Set a function for a PCA plot
QCPCA_fn <- function(inputList, Title) {
plotPCA(inputList[[2]], # takes vsd
intgroup=GroupOfInterest) + theme_bw() + ggtitle(Title)
}
# Set heatmap annotation
ColOfInterest <- !colnames(metadata) %in% c("Sample", "Path")
HeatmapAnno <- as.data.frame(metadata[, ColOfInterest])
rownames(HeatmapAnno) <- SampleNames
colnames(HeatmapAnno) <- colnames(metadata)[ColOfInterest]
# Set a function for a correlation heatmap
QCcorrHeatmap_fn <- function(inputList, Title) {
# Extract transformed count matrix
mtx <- assay(inputList[[2]]) # takes vsd
# Calculate correlation and store in the matrix
mtx <- cor(mtx)
# Create a correlation heatmap
return(pheatmap(mtx,
annotation=HeatmapAnno,
main=paste("Sample Correlation Heatmap:",
Title)))
}
grid.arrange(QCPCA_fn(TPM, "QC PCA: TPM"),
QCPCA_fn(Counts, "QC PCA: Counts"),
nrow=2)
# TPM
QCcorrHeatmap_fn(TPM, "TPM")
# Counts
QCcorrHeatmap_fn(Counts, "Counts")
# Create a list consisted with dds objects from TPM and Counts
desList <- list(TPM[[1]], Counts[[1]])
names(desList) <- TvC
# Create a list for TPM and Counts dds
ddsList <- desList # DE without shrinkage
normal.ddsList <- desList # DE with "normal" shrinkage
ape.ddsList <- desList # DE with "apeglm" shrinkage
ash.ddsList <- desList # DE with "ashr" shrinkage
for (x in TvC) {
# Run DESeq()
ddsList[[x]] <- DESeq(desList[[x]])
print(resultsNames(ddsList[[x]]))
normal.ddsList[[x]] <- lfcShrink(ddsList[[x]],
contrast=Contrast,
type="normal")
ape.ddsList[[x]] <- lfcShrink(ddsList[[x]],
coef=resultsNames(ddsList[[x]])[2],
type="apeglm")
ash.ddsList[[x]] <- lfcShrink(ddsList[[x]],
coef=resultsNames(ddsList[[x]])[2],
type="ashr")
}
## [1] "Intercept" "Group_cKO_vs_WT"
## [1] "Intercept" "Group_cKO_vs_WT"
# Combine every ddsList into a list
all.ddsList <- list(ddsList, normal.ddsList, ape.ddsList, ash.ddsList)
shrinkage <- c("None", "Normal", "Apeglm", "Ashr")
names(all.ddsList) <- shrinkage
# Plot dispersion
for (x in TvC) {
plotDispEsts(ddsList[[x]],
ylab="Dispersion",
xlab="Mean of Normalized Counts",
main=paste("Dispersion of", x, "Input"))
}
# Set FDR threshold
alpha=0.1
# FDR threshold vector
FDRv=c("< 0.1", "> 0.1")
# Initialize lists of result tables
all.resList <- all.ddsList
# Set a function cleaning table
Sig.fn <- function(df, Input) {
df <- df %>%
rownames_to_column(var="Gene") %>%
mutate(FDR=ifelse(padj < 0.1 & !is.na(padj),
FDRv[1],
FDRv[2]),
Input=Input)
return(df)
}
for (i in shrinkage) {
if (i == "None") {
for (x in TvC) {
# Extract data frames from unshrunken lfc & clean data
all.resList[[i]][[x]] <- as.data.frame(results(all.ddsList[[i]][[x]],
contrast=Contrast,
alpha=alpha)) %>%
Sig.fn(x)
}
} else {
# Extract data frames from shrunken lfc & clean data
for (x in TvC) {
all.resList[[i]][[x]] <- as.data.frame(all.ddsList[[i]][[x]]) %>%
Sig.fn(x)
}
}
}
# Explore the results
summary(all.resList)
## Length Class Mode
## None 2 -none- list
## Normal 2 -none- list
## Apeglm 2 -none- list
## Ashr 2 -none- list
head(all.resList[[1]][['TPM']])
## Gene baseMean log2FoldChange lfcSE stat
## 1 ENSMUSG00000000001 5879.180418 0.017252845 0.05844104 0.2952180
## 2 ENSMUSG00000000056 1508.720447 0.207064498 0.09848660 2.1024636
## 3 ENSMUSG00000000058 3.479592 -0.922194788 1.11921063 -0.8239689
## 4 ENSMUSG00000000078 3171.662271 0.193060324 0.07734540 2.4960801
## 5 ENSMUSG00000000088 4070.772911 0.006760457 0.06174987 0.1094813
## 6 ENSMUSG00000000093 6.585522 -0.680363745 0.80291393 -0.8473682
## pvalue padj FDR Input
## 1 0.76782737 0.9757786 > 0.1 TPM
## 2 0.03551268 0.3063213 > 0.1 TPM
## 3 0.40995721 NA > 0.1 TPM
## 4 0.01255742 0.1648302 > 0.1 TPM
## 5 0.91282074 0.9936226 > 0.1 TPM
## 6 0.39678991 NA > 0.1 TPM
head(all.resList[[1]][['Counts']])
## Gene baseMean log2FoldChange lfcSE stat
## 1 ENSMUSG00000000001 5914.744236 0.017418051 0.06091612 0.2859350
## 2 ENSMUSG00000000056 1515.414428 0.206534290 0.09949552 2.0758149
## 3 ENSMUSG00000000058 3.500628 -0.965055656 1.12473916 -0.8580262
## 4 ENSMUSG00000000078 3189.467191 0.193219406 0.07898093 2.4464059
## 5 ENSMUSG00000000088 4095.812572 0.006826617 0.06412983 0.1064499
## 6 ENSMUSG00000000093 6.711966 -0.653859194 0.80313407 -0.8141345
## pvalue padj FDR Input
## 1 0.77492790 0.9770194 > 0.1 Counts
## 2 0.03791107 0.3138926 > 0.1 Counts
## 3 0.39087800 NA > 0.1 Counts
## 4 0.01442885 0.1789632 > 0.1 Counts
## 5 0.91522537 0.9928907 > 0.1 Counts
## 6 0.41556788 NA > 0.1 Counts
head(all.resList[[2]][['TPM']])
## Gene baseMean log2FoldChange lfcSE stat
## 1 ENSMUSG00000000001 5879.180418 0.016861268 0.05711473 0.2952180
## 2 ENSMUSG00000000056 1508.720447 0.194251645 0.09239231 2.1024636
## 3 ENSMUSG00000000058 3.479592 -0.096922367 0.11921763 -0.8239689
## 4 ENSMUSG00000000078 3171.662271 0.185515335 0.07432200 2.4960801
## 5 ENSMUSG00000000088 4070.772911 0.006589589 0.06018941 0.1094813
## 6 ENSMUSG00000000093 6.585522 -0.126602312 0.14998825 -0.8473682
## pvalue padj FDR Input
## 1 0.76782737 0.9757786 > 0.1 TPM
## 2 0.03551268 0.3063213 > 0.1 TPM
## 3 0.40995721 NA > 0.1 TPM
## 4 0.01255742 0.1648302 > 0.1 TPM
## 5 0.91282074 0.9936226 > 0.1 TPM
## 6 0.39678991 NA > 0.1 TPM
head(all.resList[[2]][['Counts']])
## Gene baseMean log2FoldChange lfcSE stat
## 1 ENSMUSG00000000001 5914.744236 0.016989832 0.05941861 0.2859350
## 2 ENSMUSG00000000056 1515.414428 0.193518482 0.09322728 2.0758149
## 3 ENSMUSG00000000058 3.500628 -0.100648276 0.11891890 -0.8580262
## 4 ENSMUSG00000000078 3189.467191 0.185366747 0.07577029 2.4464059
## 5 ENSMUSG00000000088 4095.812572 0.006641088 0.06238723 0.1064499
## 6 ENSMUSG00000000093 6.711966 -0.121768425 0.15012697 -0.8141345
## pvalue padj FDR Input
## 1 0.77492790 0.9770194 > 0.1 Counts
## 2 0.03791107 0.3138926 > 0.1 Counts
## 3 0.39087800 NA > 0.1 Counts
## 4 0.01442885 0.1789632 > 0.1 Counts
## 5 0.91522537 0.9928907 > 0.1 Counts
## 6 0.41556788 NA > 0.1 Counts
# Set ylim: has to adjusted by users depending on data
yl <- c(-7.5, 7.5)
# Set min log2 fold change of interest
mLog <- c(-1, 1)
# Initialize a list storing MA plots
MAList <- ddsList
# Create MA plots
for (i in shrinkage) {
both.df <- rbind(all.resList[[i]][[TvC[1]]],
all.resList[[i]][[TvC[2]]])
MAList[[i]] <- ggplot(both.df,
aes(x=baseMean, y=log2FoldChange, color=FDR)) +geom_point() + scale_x_log10() + facet_grid(~Input) +
theme_bw() +
scale_color_manual(values=c("blue", "grey")) +
ggtitle(paste("MA plot with", i)) +
ylim(yl[1], yl[2]) +
geom_hline(yintercept=c(mLog[1], mLog[2]), linetype="dashed", color="red")
}
# Print MA plots
MAList
## $TPM
## class: DESeqDataSet
## dim: 15022 6
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(15022): ENSMUSG00000000001 ENSMUSG00000000056 ...
## ENSMUSG00000118638 ENSMUSG00000118653
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(4): Sample Group Path sizeFactor
##
## $Counts
## class: DESeqDataSet
## dim: 15022 6
## metadata(1): version
## assays(6): counts avgTxLength ... H cooks
## rownames(15022): ENSMUSG00000000001 ENSMUSG00000000056 ...
## ENSMUSG00000118638 ENSMUSG00000118653
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): WT-rep1 WT-rep2 ... cKO-rep2 cKO-rep3
## colData names(3): Sample Group Path
##
## $None
##
## $Normal
##
## $Apeglm
##
## $Ashr
# Subset rows with FDR < alpha
both.df <- rbind(all.resList[[1]][['TPM']],
all.resList[[1]][['Counts']])
both.df$Input <- factor(both.df$Input, levels=TvC)
# Plot distribution of FDR
ggplot(both.df,
aes(x=padj, color=Input)) +
geom_density(size=1, aes(y=..count..)) +
theme_bw() +
ggtitle("Distribution of False Discovery Rate (FDR)") +
xlab("Adjusted P-Value (FDR)") +
ylab("Count") +
geom_vline(xintercept=alpha,
color="black",
linetype="dashed",
size=1) +
scale_x_continuous(breaks=seq(0, 1, by=0.1))
# Initialize a lfc list
lfcplotList <- all.resList
# Create lfc distribution plots
for (i in shrinkage) {
lfc.df <- rbind(all.resList[[i]][[TvC[1]]],
all.resList[[i]][[TvC[2]]])
lfc.df <- lfc.df[lfc.df$FDR == "< 0.1",]
lfc.df$Input <- factor(lfc.df$Input, levels=TvC)
lfcplotList[[i]] <- ggplot(lfc.df, # Subset rows with FDR < alpha
aes(x=log2FoldChange, color=Input)) +
geom_density(size=1, aes(y=..count..)) +
theme_bw() + ylab("Count") +
geom_vline(xintercept=c(mLog[1], mLog[2]),
color="black",
linetype="dashed",
size=1) +
ggtitle(paste("Distribution of Log2FoldChange by Input Type:", i)) +
xlim(-5, 5)
}
# Print the lfc plots
lfcplotList
## $None
##
## $Normal
##
## $Apeglm
##
## $Ashr
# Count number of NA genes
type=c("Zero Counts", "Outliers", "Total NA Genes")
# Create a data frame storing NA gene number
NAstat <- both.df %>%
group_by(Input) %>%
summarize(zero=sum(is.na(log2FoldChange)),
outlier=sum(is.na(pvalue) & is.na(padj))) %>%
mutate(total=zero + outlier) %>%
gather(Type, Number, -Input) %>%
mutate(Type=factor(case_when(Type == "zero" ~ type[1],
Type == "outlier" ~ type[2],
Type == "total" ~ type[3]),
levels=type))
# Plot number of NA genes
ggplot(NAstat,
aes(x=Type, y=Number, group=Input, fill=Input, label=Number)) +
geom_bar(stat="identity", position="dodge") +
theme_bw() +
geom_text(position=position_dodge(width=1), vjust=1.5) +
ggtitle("Number of NA Genes") +
ylab("Number of Genes")
# Create a new list having DE table with FDR below alpha
fdr.rank <- all.resList
lfc.rank <- all.resList
up.lfc.rank <- all.resList
down.lfc.rank <- all.resList
# Set a function cleaning a data frame
filter.fdr.fn <- function(df) {as.data.table(df[df$FDR == FDRv[1],])}
# Set a function creating a column for the rank
Ranking.fn <- function(x) {mutate(x, Rank=1:nrow(x))}
for (i in shrinkage) {
for (x in TvC) {
rankdf <- all.resList[[i]][[x]]
fdr.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% arrange(padj) %>% Ranking.fn()
lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% arrange(desc(abs(log2FoldChange))) %>% Ranking.fn()
up.lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>%
arrange(desc(log2FoldChange)) %>%
Ranking.fn()
down.lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>%
arrange(log2FoldChange) %>%
Ranking.fn()
}
}
# Explore the ranking outputs
head(fdr.rank[[1]][[1]])
## Gene baseMean log2FoldChange lfcSE stat
## 1: ENSMUSG00000022018 638.2601 -1.2818702 0.1145578 -11.189728
## 2: ENSMUSG00000040498 713.2059 -2.2334532 0.2151601 -10.380425
## 3: ENSMUSG00000032011 16087.6748 -1.9132468 0.1853843 -10.320433
## 4: ENSMUSG00000047180 1002.6988 -1.4158861 0.1397523 -10.131398
## 5: ENSMUSG00000020689 2308.2182 -1.5344663 0.1555018 -9.867836
## 6: ENSMUSG00000024065 1102.5550 -0.9671978 0.1170542 -8.262819
## pvalue padj FDR Input Rank
## 1: 4.578363e-29 2.152288e-25 < 0.1 TPM 1
## 2: 3.044162e-25 7.155302e-22 < 0.1 TPM 2
## 3: 5.696543e-25 8.926483e-22 < 0.1 TPM 3
## 4: 4.008763e-24 4.711299e-21 < 0.1 TPM 4
## 5: 5.738939e-23 5.395750e-20 < 0.1 TPM 5
## 6: 1.422715e-16 1.114697e-13 < 0.1 TPM 6
head(fdr.rank[[1]][[2]])
## Gene baseMean log2FoldChange lfcSE stat
## 1: ENSMUSG00000022018 641.9707 -1.280927 0.1159692 -11.045405
## 2: ENSMUSG00000040498 717.1636 -2.233531 0.2135438 -10.459359
## 3: ENSMUSG00000032011 16169.4283 -1.913100 0.1840727 -10.393178
## 4: ENSMUSG00000047180 1008.5805 -1.416004 0.1400262 -10.112417
## 5: ENSMUSG00000020689 2322.4101 -1.534429 0.1547689 -9.914329
## 6: ENSMUSG00000024065 1109.4381 -0.966510 0.1177500 -8.208153
## pvalue padj FDR Input Rank
## 1: 2.307255e-28 1.044956e-24 < 0.1 Counts 1
## 2: 1.327505e-25 3.006135e-22 < 0.1 Counts 2
## 3: 2.663268e-25 4.020647e-22 < 0.1 Counts 3
## 4: 4.866840e-24 5.510480e-21 < 0.1 Counts 4
## 5: 3.606716e-23 3.266964e-20 < 0.1 Counts 5
## 6: 2.246172e-16 1.695486e-13 < 0.1 Counts 6
head(lfc.rank[[1]][[1]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSMUSG00000078926 9.607045 -4.750698 1.6717195 -2.841804 4.485912e-03
## 2: ENSMUSG00000035606 9.077546 -4.707677 1.3988639 -3.365357 7.644454e-04
## 3: ENSMUSG00000071713 9.742826 -4.186298 1.5134202 -2.766118 5.672806e-03
## 4: ENSMUSG00000087412 9.268108 -4.142177 1.2267197 -3.376629 7.338005e-04
## 5: ENSMUSG00000025929 39.258316 -3.504991 0.7640608 -4.587320 4.489727e-06
## 6: ENSMUSG00000061769 12.344903 3.297253 0.9505086 3.468936 5.225245e-04
## padj FDR Input Rank
## 1: 0.083024702 < 0.1 TPM 1
## 2: 0.024281473 < 0.1 TPM 2
## 3: 0.096622684 < 0.1 TPM 3
## 4: 0.023466641 < 0.1 TPM 4
## 5: 0.000570438 < 0.1 TPM 5
## 6: 0.018990240 < 0.1 TPM 6
head(lfc.rank[[1]][[2]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSMUSG00000087412 9.244459 -3.936202 1.1691184 -3.366812 7.604245e-04
## 2: ENSMUSG00000025929 39.611464 -3.516173 0.7543463 -4.661219 3.143427e-06
## 3: ENSMUSG00000061769 12.291486 3.259101 0.9527112 3.420870 6.242122e-04
## 4: ENSMUSG00000043932 14.922102 3.162490 0.8837090 3.578655 3.453674e-04
## 5: ENSMUSG00000037129 21.804492 -2.994600 0.6584157 -4.548190 5.410923e-06
## 6: ENSMUSG00000059136 9.189472 2.688384 0.9096773 2.955316 3.123490e-03
## padj FDR Input Rank
## 1: 0.0244252678 < 0.1 Counts 1
## 2: 0.0003954606 < 0.1 Counts 2
## 3: 0.0207871846 < 0.1 Counts 3
## 4: 0.0140916134 < 0.1 Counts 4
## 5: 0.0006283608 < 0.1 Counts 5
## 6: 0.0670440156 < 0.1 Counts 6
head(up.lfc.rank[[1]][[1]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSMUSG00000061769 12.34490 3.297253 0.9505086 3.468936 0.0005225245
## 2: ENSMUSG00000043932 15.01516 3.211369 0.8834582 3.634999 0.0002779822
## 3: ENSMUSG00000059136 14.52260 2.411968 0.7542152 3.197983 0.0013839223
## 4: ENSMUSG00000082901 12.00487 2.403901 0.7930771 3.031107 0.0024365917
## 5: ENSMUSG00000026241 18.25948 2.241226 0.6781347 3.304987 0.0009498089
## 6: ENSMUSG00000051726 15.38640 2.208510 0.5997559 3.682349 0.0002310946
## padj FDR Input Rank
## 1: 0.01899024 < 0.1 TPM 1
## 2: 0.01221303 < 0.1 TPM 2
## 3: 0.03830822 < 0.1 TPM 3
## 4: 0.05727209 < 0.1 TPM 4
## 5: 0.02862212 < 0.1 TPM 5
## 6: 0.01091174 < 0.1 TPM 6
head(up.lfc.rank[[1]][[2]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSMUSG00000061769 12.291486 3.259101 0.9527112 3.420870 0.0006242122
## 2: ENSMUSG00000043932 14.922102 3.162490 0.8837090 3.578655 0.0003453674
## 3: ENSMUSG00000059136 9.189472 2.688384 0.9096773 2.955316 0.0031234902
## 4: ENSMUSG00000082901 12.087585 2.412037 0.7922746 3.044445 0.0023310977
## 5: ENSMUSG00000026241 18.124144 2.255377 0.6737004 3.347745 0.0008147201
## 6: ENSMUSG00000051726 15.505212 2.228789 0.6016818 3.704265 0.0002120042
## padj FDR Input Rank
## 1: 0.02078718 < 0.1 Counts 1
## 2: 0.01409161 < 0.1 Counts 2
## 3: 0.06704402 < 0.1 Counts 3
## 4: 0.05556601 < 0.1 Counts 4
## 5: 0.02544736 < 0.1 Counts 5
## 6: 0.01000174 < 0.1 Counts 6
head(down.lfc.rank[[1]][[1]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSMUSG00000078926 9.607045 -4.750698 1.6717195 -2.841804 4.485912e-03
## 2: ENSMUSG00000035606 9.077546 -4.707677 1.3988639 -3.365357 7.644454e-04
## 3: ENSMUSG00000071713 9.742826 -4.186298 1.5134202 -2.766118 5.672806e-03
## 4: ENSMUSG00000087412 9.268108 -4.142177 1.2267197 -3.376629 7.338005e-04
## 5: ENSMUSG00000025929 39.258316 -3.504991 0.7640608 -4.587320 4.489727e-06
## 6: ENSMUSG00000047501 7.042870 -3.211922 1.0593659 -3.031929 2.429966e-03
## padj FDR Input Rank
## 1: 0.083024702 < 0.1 TPM 1
## 2: 0.024281473 < 0.1 TPM 2
## 3: 0.096622684 < 0.1 TPM 3
## 4: 0.023466641 < 0.1 TPM 4
## 5: 0.000570438 < 0.1 TPM 5
## 6: 0.057272088 < 0.1 TPM 6
head(down.lfc.rank[[1]][[2]])
## Gene baseMean log2FoldChange lfcSE stat pvalue
## 1: ENSMUSG00000087412 9.244459 -3.936202 1.1691184 -3.366812 7.604245e-04
## 2: ENSMUSG00000025929 39.611464 -3.516173 0.7543463 -4.661219 3.143427e-06
## 3: ENSMUSG00000037129 21.804492 -2.994600 0.6584157 -4.548190 5.410923e-06
## 4: ENSMUSG00000070368 43.604624 -2.650362 0.6872414 -3.856522 1.150116e-04
## 5: ENSMUSG00000040724 25.325681 -2.519621 0.6750384 -3.732560 1.895438e-04
## 6: ENSMUSG00000030217 25.976658 -2.507083 0.5278218 -4.749866 2.035511e-06
## padj FDR Input Rank
## 1: 0.0244252678 < 0.1 Counts 1
## 2: 0.0003954606 < 0.1 Counts 2
## 3: 0.0006283608 < 0.1 Counts 3
## 4: 0.0066780461 < 0.1 Counts 4
## 5: 0.0093309103 < 0.1 Counts 5
## 6: 0.0002880885 < 0.1 Counts 6
# Set a function rebuilding DE tables with gene ranks
rankdiff.fn <- function(List){
# Select columns and join the data frames by gene
full_join(List[[TvC[1]]][, .(Gene, Input, Rank, baseMean)],
List[[TvC[2]]][, .(Gene, Input, Rank, baseMean)],
by="Gene") %>%
# Add columns assining gene expression levels, rank differences, and mean ranks
mutate(logMeanExpression=log(baseMean.x+baseMean.y/2),
RankDiff=Rank.x-Rank.y,
MeanRank=(Rank.x+Rank.y)/2)
}
# Set a function adding Shrinkage column
add.shr.fn <- function(df, shr) {mutate(df, Shrinkage=shr)}
# Set a function rbinding multiple data frames
rbinds.fn <- function(List) {rbind(List[[1]],
List[[2]],
List[[3]],
List[[4]])}
# Calculate and plot rank difference
for (i in shrinkage) {
# Calculate rank difference
fdr.rank[[i]] <- rankdiff.fn(fdr.rank[[i]]) %>% add.shr.fn(i)
lfc.rank[[i]] <- rankdiff.fn(lfc.rank[[i]]) %>% add.shr.fn(i)
up.lfc.rank[[i]] <- rankdiff.fn(up.lfc.rank[[i]]) %>% add.shr.fn(i)
down.lfc.rank[[i]] <- rankdiff.fn(down.lfc.rank[[i]]) %>% add.shr.fn(i)
}
# Create combined data frames across the shrinkages
fdr.rank.df <- rbinds.fn(fdr.rank)
lfc.rank.df <- rbinds.fn(lfc.rank)
up.lfc.rank.df <- rbinds.fn(up.lfc.rank)
down.lfc.rank.df <- rbinds.fn(down.lfc.rank)
# Explore the ranking outputs
head(fdr.rank.df)
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000022018 TPM 1 638.2601 Counts 1 641.9707
## 2: ENSMUSG00000040498 TPM 2 713.2059 Counts 2 717.1636
## 3: ENSMUSG00000032011 TPM 3 16087.6748 Counts 3 16169.4283
## 4: ENSMUSG00000047180 TPM 4 1002.6988 Counts 4 1008.5805
## 5: ENSMUSG00000020689 TPM 5 2308.2182 Counts 5 2322.4101
## 6: ENSMUSG00000024065 TPM 6 1102.5550 Counts 6 1109.4381
## logMeanExpression RankDiff MeanRank Shrinkage
## 1: 6.866147 0 1 None
## 2: 6.977083 0 2 None
## 3: 10.092966 0 3 None
## 4: 7.317869 0 4 None
## 5: 8.151744 0 5 None
## 6: 7.412929 0 6 None
head(lfc.rank.df)
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000078926 TPM 1 9.607045 <NA> NA NA
## 2: ENSMUSG00000035606 TPM 2 9.077546 <NA> NA NA
## 3: ENSMUSG00000071713 TPM 3 9.742826 <NA> NA NA
## 4: ENSMUSG00000087412 TPM 4 9.268108 Counts 1 9.244459
## 5: ENSMUSG00000025929 TPM 5 39.258316 Counts 2 39.611464
## 6: ENSMUSG00000061769 TPM 6 12.344903 Counts 3 12.291486
## logMeanExpression RankDiff MeanRank Shrinkage
## 1: NA NA NA None
## 2: NA NA NA None
## 3: NA NA NA None
## 4: 2.631193 3 2.5 None
## 5: 4.078622 3 3.5 None
## 6: 2.917265 3 4.5 None
head(up.lfc.rank.df)
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000061769 TPM 1 12.34490 Counts 1 12.291486
## 2: ENSMUSG00000043932 TPM 2 15.01516 Counts 2 14.922102
## 3: ENSMUSG00000059136 TPM 3 14.52260 Counts 3 9.189472
## 4: ENSMUSG00000082901 TPM 4 12.00487 Counts 4 12.087585
## 5: ENSMUSG00000026241 TPM 5 18.25948 Counts 5 18.124144
## 6: ENSMUSG00000051726 TPM 6 15.38640 Counts 6 15.505212
## logMeanExpression RankDiff MeanRank Shrinkage
## 1: 2.917265 0 1 None
## 2: 3.112457 0 2 None
## 3: 2.950596 0 3 None
## 4: 2.893071 0 4 None
## 5: 3.307676 0 5 None
## 6: 3.141520 0 6 None
head(down.lfc.rank.df)
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000078926 TPM 1 9.607045 <NA> NA NA
## 2: ENSMUSG00000035606 TPM 2 9.077546 <NA> NA NA
## 3: ENSMUSG00000071713 TPM 3 9.742826 <NA> NA NA
## 4: ENSMUSG00000087412 TPM 4 9.268108 Counts 1 9.244459
## 5: ENSMUSG00000025929 TPM 5 39.258316 Counts 2 39.611464
## 6: ENSMUSG00000047501 TPM 6 7.042870 <NA> NA NA
## logMeanExpression RankDiff MeanRank Shrinkage
## 1: NA NA NA None
## 2: NA NA NA None
## 3: NA NA NA None
## 4: 2.631193 3 2.5 None
## 5: 4.078622 3 3.5 None
## 6: NA NA NA None
# Set a function plotting gene ranks between TPM- (x-axis) and Counts-Inputs (y-axis)
ranking.plot.fn <- function(df, rankedby) {
df$Shrinkage <- factor(df$Shrinkage, levels=shrinkage)
ggplot(df,
aes(x=Rank.x, y=Rank.y, color=logMeanExpression)) + geom_point(alpha=0.5) + facet_grid(~Shrinkage) + theme_bw() + theme(strip.text.x=element_text(size=10)) + xlab("Rank with TPM") + ylab("Rank with Counts") + geom_abline(slope=1, color="black", size=0.5) + ggtitle(paste(rankedby, "Ranking with TPM vs Count Inputs")) + scale_color_gradient(low="blue", high="red")
}
# Print output plots
ranking.plot.fn(fdr.rank.df, "FDR")
ranking.plot.fn(lfc.rank.df, "Log2FoldChange")
ranking.plot.fn(up.lfc.rank.df, "Log2FoldChange (Increase)")
ranking.plot.fn(down.lfc.rank.df, "Log2FoldChange (Decrease)")
# Set a function plotting the rank difference over the gene expression level
rankdiff.plot.fn <- function(df, rankedby, ylim) {
df$Shrinkage <- factor(df$Shrinkage, levels=shrinkage)
ggplot(df, aes(x=logMeanExpression, y=RankDiff, color=MeanRank)) +
geom_point(alpha=0.5) +
theme_bw() +
facet_grid(~Shrinkage) +
theme(strip.text.x=element_text(size=10)) +
ylab("Rank Difference (TPM - Count)") +
ggtitle(paste("Rank Difference Inputs (TPM - Count):", rankedby)) +
geom_hline(yintercept=0, color="black", size=0.5) + scale_color_gradient(low="blue", high="red") +
ylim(-ylim, ylim)
}
# Print output plots
rankdiff.plot.fn(fdr.rank.df, "FDR", 150)
rankdiff.plot.fn(lfc.rank.df, "Log2FoldChange", 200)
rankdiff.plot.fn(up.lfc.rank.df, "Log2FoldChange (Increase)", 150)
rankdiff.plot.fn(down.lfc.rank.df, "Log2FoldChange (Decrease)", 160)
# Set a function subsetting unshrunken data
unshr.rankdiff.fn <- function(df) {subset(df, Shrinkage == "None")}
# Create a list storing rankdiff data frames
rankList <- list(unshr.rankdiff.fn(fdr.rank.df),
unshr.rankdiff.fn(lfc.rank.df),
unshr.rankdiff.fn(up.lfc.rank.df),
unshr.rankdiff.fn(down.lfc.rank.df))
# Assine result column as a factor with levels
rankdiff.levels <- c("FDR",
"log2FoldChange",
"log2FoldChange.Increase",
"log2FoldChange.Decrease")
# Name the list
names(rankList) <- rankdiff.levels
# Create a new data frame storing rank difference by result type
rankdiff.dist <- data.frame(FDR=abs(rankList[[1]]$RankDiff),
log2FoldChange=abs(rankList[[2]]$RankDiff),
log2FoldChange.Increase=abs(rankList[[3]]$RankDiff),
log2FoldChange.Decrease=abs(rankList[[4]]$RankDiff)) %>% gather(Result, RankDiff)
rankdiff.dist$Result <- factor(rankdiff.dist$Result, levels=rankdiff.levels)
# Plot distribution of absolute rank difference
ggplot(rankdiff.dist,
aes(x=Result, y=RankDiff, color=Result)) +
geom_jitter(alpha=0.5) +
geom_boxplot(alpha=0.5, fill="grey", color="black") +
theme_bw() +
theme(axis.text.x=element_text(angle=45, hjust=1)) +
ggtitle("Distribution of Absolute Rank Difference without Shrinkage") +
ylab("Absolute Rank Difference")
# Create a data frame storing the number of transcripts by gene id
AnnoDb.ntrans <- AnnoDb %>%
group_by(ENSEMBL) %>%
summarize(num.trans=n())
# Set a function adding the number of transcripts by gene id
add.ntrans.fn <- function(df) {
left_join(df, AnnoDb.ntrans, by=c("Gene"="ENSEMBL"))}
# Add a column indicating the number of transcripts by gene id to every rankdiff data frame
for (x in rankdiff.levels) {
rankList[[x]] <- add.ntrans.fn(rankList[[x]])
}
# Explore the edited data frames
summary(rankList)
## Length Class Mode
## FDR 12 data.table list
## log2FoldChange 12 data.table list
## log2FoldChange.Increase 12 data.table list
## log2FoldChange.Decrease 12 data.table list
head(rankList[[1]])
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000022018 TPM 1 638.2601 Counts 1 641.9707
## 2: ENSMUSG00000040498 TPM 2 713.2059 Counts 2 717.1636
## 3: ENSMUSG00000032011 TPM 3 16087.6748 Counts 3 16169.4283
## 4: ENSMUSG00000047180 TPM 4 1002.6988 Counts 4 1008.5805
## 5: ENSMUSG00000020689 TPM 5 2308.2182 Counts 5 2322.4101
## 6: ENSMUSG00000024065 TPM 6 1102.5550 Counts 6 1109.4381
## logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1: 6.866147 0 1 None 1
## 2: 6.977083 0 2 None 4
## 3: 10.092966 0 3 None 6
## 4: 7.317869 0 4 None 2
## 5: 8.151744 0 5 None 2
## 6: 7.412929 0 6 None 2
head(rankList[[2]])
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000078926 TPM 1 9.607045 <NA> NA NA
## 2: ENSMUSG00000035606 TPM 2 9.077546 <NA> NA NA
## 3: ENSMUSG00000071713 TPM 3 9.742826 <NA> NA NA
## 4: ENSMUSG00000087412 TPM 4 9.268108 Counts 1 9.244459
## 5: ENSMUSG00000025929 TPM 5 39.258316 Counts 2 39.611464
## 6: ENSMUSG00000061769 TPM 6 12.344903 Counts 3 12.291486
## logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1: NA NA NA None 5
## 2: NA NA NA None 2
## 3: NA NA NA None 8
## 4: 2.631193 3 2.5 None 2
## 5: 4.078622 3 3.5 None 1
## 6: 2.917265 3 4.5 None 1
head(rankList[[3]])
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000061769 TPM 1 12.34490 Counts 1 12.291486
## 2: ENSMUSG00000043932 TPM 2 15.01516 Counts 2 14.922102
## 3: ENSMUSG00000059136 TPM 3 14.52260 Counts 3 9.189472
## 4: ENSMUSG00000082901 TPM 4 12.00487 Counts 4 12.087585
## 5: ENSMUSG00000026241 TPM 5 18.25948 Counts 5 18.124144
## 6: ENSMUSG00000051726 TPM 6 15.38640 Counts 6 15.505212
## logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1: 2.917265 0 1 None 1
## 2: 3.112457 0 2 None 1
## 3: 2.950596 0 3 None 4
## 4: 2.893071 0 4 None 1
## 5: 3.307676 0 5 None 1
## 6: 3.141520 0 6 None 1
head(rankList[[4]])
## Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSMUSG00000078926 TPM 1 9.607045 <NA> NA NA
## 2: ENSMUSG00000035606 TPM 2 9.077546 <NA> NA NA
## 3: ENSMUSG00000071713 TPM 3 9.742826 <NA> NA NA
## 4: ENSMUSG00000087412 TPM 4 9.268108 Counts 1 9.244459
## 5: ENSMUSG00000025929 TPM 5 39.258316 Counts 2 39.611464
## 6: ENSMUSG00000047501 TPM 6 7.042870 <NA> NA NA
## logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1: NA NA NA None 5
## 2: NA NA NA None 2
## 3: NA NA NA None 8
## 4: 2.631193 3 2.5 None 2
## 5: 4.078622 3 3.5 None 1
## 6: NA NA NA None 1
# Set a function plotting rank difference vs number of transcripts
rank.ntrans.plot.fn <- function(df, title) {
ggplot(df, aes(x=num.trans, y=abs(RankDiff), color=MeanRank)) +
geom_jitter(alpha=0.5) +
theme_bw() +
ggtitle(paste("Rank Difference vs Number of Transcripts in", title)) +
xlab("Number of Transcripts") +
ylab("Absolute Rank Difference (TPM - Counts)") + scale_color_gradient(low="blue", high="red")
}
# Print the plots
rank.ntrans.plot.fn(rankList[[1]], "FDR")
rank.ntrans.plot.fn(rankList[[2]], "log2FoldChange")
rank.ntrans.plot.fn(rankList[[3]], "log2FoldChange (Increase)")
rank.ntrans.plot.fn(rankList[[4]], "log2FoldChange (Decrease)")
# Initialize a list storing rankdiff genes
large.rankdiff <- rankList
# Assign a vector storing minimum (thresholds) rankdiff for filtering large rankdiff genes
rankdiff.thr <- c(10, 10, 10, 10)
names(rankdiff.thr) <- rankdiff.levels
for (x in rankdiff.levels) {
# Filter out observations below the rankdiff thresholds
large.rankdiff[[x]] <- subset(rankList[[x]],
abs(RankDiff) > rankdiff.thr[x])
}
# Explore the filtered genes
summary(large.rankdiff)
## Length Class Mode
## FDR 12 data.table list
## log2FoldChange 12 data.table list
## log2FoldChange.Increase 12 data.table list
## log2FoldChange.Decrease 12 data.table list
dim(large.rankdiff[[rankdiff.levels[1]]])
## [1] 34 12
dim(large.rankdiff[[rankdiff.levels[2]]])
## [1] 3 12
dim(large.rankdiff[[rankdiff.levels[3]]])
## [1] 0 12
dim(large.rankdiff[[rankdiff.levels[4]]])
## [1] 0 12
# Set a function cleaning data to generate upset plots
upset.input.fn <- function(df) {
df <- df %>%
# Filter genes with valid padj
filter(!is.na(padj)) %>%
mutate(Up=ifelse(FDR == FDRv[1] & log2FoldChange > 0, Gene, ""), # What are upregulated genes?
Down=ifelse(FDR == FDRv[1] & log2FoldChange < 0, Gene, ""), # What are downregulated genes?
Unchanged=ifelse(FDR == FDRv[2], Gene, ""), # What are unchanged genes?
TPM_Input=ifelse(Input == "TPM", Gene, ""), # What are the genes from TPM input?
Counts_Input=ifelse(Input == "Counts", Gene, "")) # What are the genes from Counts input?
# Create a list storing groups of interest
upsetInput <- list(Up=df$Up,
Down=df$Down,
Unchanged=df$Unchanged,
TPM_Input=df$TPM,
Counts_Input=df$Counts)
return(upsetInput)
}
upsetList <- upset.input.fn(both.df)
# Create the upset plot
upset(fromList(upsetList),
sets=names(upsetList), # What group to display
sets.x.label="Number of Genes per Group",
order.by="freq",
point.size=3,
sets.bar.color=c("red", "red","blue", "blue", "blue"),
text.scale = 1.5, number.angles=30)
sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
##
## Matrix products: default
## BLAS/LAPACK: /home/mira/miniconda3/envs/r/lib/libopenblasp-r0.3.10.so
##
## locale:
## [1] LC_CTYPE=en_US.UTF-8 LC_NUMERIC=C
## [3] LC_TIME=en_US.UTF-8 LC_COLLATE=en_US.UTF-8
## [5] LC_MONETARY=en_US.UTF-8 LC_MESSAGES=en_US.UTF-8
## [7] LC_PAPER=en_US.UTF-8 LC_NAME=C
## [9] LC_ADDRESS=C LC_TELEPHONE=C
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C
##
## attached base packages:
## [1] stats4 parallel stats graphics grDevices utils datasets
## [8] methods base
##
## other attached packages:
## [1] AnnotationDbi_1.52.0 ashr_2.2-47
## [3] apeglm_1.12.0 UpSetR_1.4.0
## [5] gridExtra_2.3 pheatmap_1.0.12
## [7] DESeq2_1.30.0 SummarizedExperiment_1.20.0
## [9] Biobase_2.50.0 MatrixGenerics_1.2.0
## [11] matrixStats_0.57.0 GenomicRanges_1.42.0
## [13] GenomeInfoDb_1.26.2 IRanges_2.24.0
## [15] S4Vectors_0.28.1 tximport_1.18.0
## [17] forcats_0.5.0 stringr_1.4.0
## [19] dplyr_1.0.2 purrr_0.3.4
## [21] readr_1.4.0 tidyr_1.1.2
## [23] tibble_3.0.4 ggplot2_3.3.2
## [25] tidyverse_1.3.0 AnnotationHub_2.22.0
## [27] BiocFileCache_1.14.0 dbplyr_2.0.0
## [29] BiocGenerics_0.36.0 rmarkdown_2.5
## [31] data.table_1.13.4
##
## loaded via a namespace (and not attached):
## [1] colorspace_2.0-0 ellipsis_0.3.1
## [3] XVector_0.30.0 fs_1.5.0
## [5] rstudioapi_0.13 farver_2.0.3
## [7] bit64_4.0.5 mvtnorm_1.1-1
## [9] interactiveDisplayBase_1.28.0 fansi_0.4.1
## [11] lubridate_1.7.9.2 xml2_1.3.2
## [13] splines_4.0.3 geneplotter_1.68.0
## [15] knitr_1.30 jsonlite_1.7.2
## [17] broom_0.7.2 annotate_1.68.0
## [19] shiny_1.5.0 BiocManager_1.30.10
## [21] compiler_4.0.3 httr_1.4.2
## [23] backports_1.2.1 assertthat_0.2.1
## [25] Matrix_1.2-18 fastmap_1.0.1
## [27] cli_2.2.0 later_1.1.0.1
## [29] htmltools_0.5.0 tools_4.0.3
## [31] coda_0.19-4 gtable_0.3.0
## [33] glue_1.4.2 GenomeInfoDbData_1.2.4
## [35] rappdirs_0.3.1 Rcpp_1.0.5
## [37] bbmle_1.0.23.1 cellranger_1.1.0
## [39] vctrs_0.3.5 xfun_0.19
## [41] ps_1.5.0 rvest_0.3.6
## [43] irlba_2.3.3 mime_0.9
## [45] lifecycle_0.2.0 XML_3.99-0.5
## [47] MASS_7.3-53 zlibbioc_1.36.0
## [49] scales_1.1.1 hms_0.5.3
## [51] promises_1.1.1 RColorBrewer_1.1-2
## [53] yaml_2.2.1 curl_4.3
## [55] memoise_1.1.0 emdbook_1.3.12
## [57] bdsmatrix_1.3-4 SQUAREM_2020.5
## [59] stringi_1.5.3 RSQLite_2.2.1
## [61] BiocVersion_3.12.0 genefilter_1.72.0
## [63] BiocParallel_1.24.1 truncnorm_1.0-8
## [65] rlang_0.4.9 pkgconfig_2.0.3
## [67] bitops_1.0-6 invgamma_1.1
## [69] evaluate_0.14 lattice_0.20-41
## [71] labeling_0.4.2 bit_4.0.4
## [73] tidyselect_1.1.0 plyr_1.8.6
## [75] magrittr_2.0.1 R6_2.5.0
## [77] generics_0.1.0 DelayedArray_0.16.0
## [79] DBI_1.1.0 pillar_1.4.7
## [81] haven_2.3.1 withr_2.3.0
## [83] mixsqp_0.3-43 survival_3.2-7
## [85] RCurl_1.98-1.2 modelr_0.1.8
## [87] crayon_1.3.4 locfit_1.5-9.4
## [89] grid_4.0.3 readxl_1.3.1
## [91] blob_1.2.1 reprex_0.3.0
## [93] digest_0.6.27 xtable_1.8-4
## [95] numDeriv_2016.8-1.1 httpuv_1.5.4
## [97] munsell_0.5.0